Topic Modelling¶

In [2]:
import pandas as pd
df_relevant_articles = pd.read_csv('ai_articles.csv')
In [3]:
df_relevant_articles.dropna(subset=['cleaned_text'], inplace=True)
In [4]:
df_relevant_articles
Out[4]:
url date language title text cleaned_text tokens
0 http://businessnewsthisweek.com/business/infog... 2023-05-20 en Infogain AI Business Solutions Now Available i... \n\nInfogain AI Business Solutions Now Availab... infogain ai business solutions now available i... ['infogain', 'ai', 'business', 'solution', 'av...
1 https://allafrica.com/stories/202504250184.html 2025-04-25 en Africa: AI Policies in Africa - Lessons From G... \nAfrica: AI Policies in Africa - Lessons From... africa ai policies in africa lessons from gha... ['africa', 'ai', 'policy', 'africa', 'lesson',...
2 https://asiatimes.com/2023/07/yang-lan-intervi... 2023-07-25 en Yang Lan interviews academics on AI developmen... \nYang Lan interviews academics on AI developm... yang lan interviews academics on ai developmen... ['yang', 'lan', 'interview', 'academic', 'ai',...
3 https://cdn.meritalk.com/articles/commerce-nom... 2025-02-04 en Commerce Nominee Promises Increased Domestic A... \nCommerce Nominee Promises Increased Domestic... commerce nominee promises increased domestic a... ['commerce', 'nominee', 'promise', 'increased'...
4 https://citylife.capetown/hmn/uncategorized/re... 2023-11-11 en Revolutionizing the Manufacturing Industry: Th... Revolutionizing the Manufacturing Industry:... revolutionizing the manufacturing industry the... ['revolutionizing', 'manufacturing', 'industry...
... ... ... ... ... ... ... ...
157501 https://www.wndu.com/prnewswire/2023/11/15/woo... 2023-11-15 en Woolpert and Allvision Forge Strategic Agreeme... Woolpert and Allvision Forge Strategic Agreeme... woolpert and allvision forge strategic agreeme... ['woolpert', 'allvision', 'forge', 'strategic'...
157502 https://www.wusf.org/2024-05-17/openais-new-ch... 2024-05-17 en OpenAI's new ChatGPT talks and sings. But how ... \nOpenAI's new ChatGPT talks and sings. But ho... openais new chatgpt talks and sings but how hu... ['openais', 'new', 'chatgpt', 'talk', 'sings',...
157503 https://www.wuwf.org/2024-06-03/all-eyes-on-ra... 2024-06-03 en ‘All eyes on Rafah’ is the Internet's most vir... \n‘All eyes on Rafah’ is the Internet's most v... all eyes on rafah is the internets most viral ... ['eye', 'rafah', 'internet', 'viral', 'ai', 't...
157504 https://www.zawya.com/en/press-release/compani... 2024-04-15 en SentinelOne to spotlight Purple AI at GISEC 2024 SentinelOne to spotlight Purple AI at GISEC 20... sentinelone to spotlight purple ai at gisec go... ['sentinelone', 'spotlight', 'purple', 'ai', '...
157505 https://www.zawya.com/en/press-release/governm... 2024-06-13 en DGHR and DCAI join forces to support the world... DGHR and DCAI join forces to support the world... dghr and dcai join forces to support the world... ['dghr', 'dcai', 'join', 'force', 'support', '...

157506 rows × 7 columns

In [5]:
print(df_relevant_articles.shape)
(157506, 7)
In [10]:
# Imports 
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
from umap import UMAP
from hdbscan import HDBSCAN
import pandas as pd
import plotly.io as pio

#  Load the full dataset
docs = df_relevant_articles['cleaned_text'].astype(str).tolist()

#  Define the embedding model
embedding_model = SentenceTransformer("all-mpnet-base-v2")

#  Custom BERTopic components
umap_model = UMAP(n_components=15, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=25, metric='euclidean', prediction_data=True)
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2), min_df=10)
ctfidf_model = ClassTfidfTransformer()

#  Initialize BERTopic
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    top_n_words=15,
    nr_topics="auto",
    calculate_probabilities=True,
    verbose=True
)

#  Generate embeddings and fit model
embeddings = embedding_model.encode(docs, show_progress_bar=True)
topics, probs = topic_model.fit_transform(docs, embeddings)

# Assign topics back to original dataframe
df_relevant_articles['topic'] = topics

#  Visualize topic summary
fig = topic_model.visualize_barchart(top_n_topics=15)
fig.show()

#  View topic table
topic_model.get_topic_info().head(10)
Batches:   0%|          | 0/4923 [00:00<?, ?it/s]
2025-05-26 17:55:10,325 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2025-05-26 17:57:28,161 - BERTopic - Dimensionality - Completed ✓
2025-05-26 17:57:28,165 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2025-05-26 20:42:23,669 - BERTopic - Cluster - Completed ✓
2025-05-26 20:42:23,672 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-05-26 20:44:35,862 - BERTopic - Representation - Completed ✓
2025-05-26 20:44:36,055 - BERTopic - Topic reduction - Reducing number of topics
2025-05-26 20:44:37,480 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-26 20:47:44,840 - BERTopic - Representation - Completed ✓
2025-05-26 20:47:44,888 - BERTopic - Topic reduction - Reduced number of topics from 1047 to 171
Out[10]:
Topic Count Name Representation Representative_Docs
0 -1 57406 -1_ai_laptop_news_new [ai, laptop, news, new, data, technology, medi... [dataloop to supercharge adoption timeline for...
1 0 90058 0_ai_news_new_technology [ai, news, new, technology, data, share, busin... [generative artificial intelligence research r...
2 1 432 1_sports_nfl_wimbledon_ibm [sports, nfl, wimbledon, ibm, players, fans, t... [ibm brings generative ai commentary and ai dr...
3 2 262 2_species_birds_conservation_bird [species, birds, conservation, bird, animals, ... [a new ai tool generates synthetic birdsongs t...
4 3 231 3_beatles_mccartney_song_paul mccartney [beatles, mccartney, song, paul mccartney, pau... [the beatles are releasing their last record a...
5 4 228 4_weather_forecasting_forecasts_climate [weather, forecasting, forecasts, climate, pre... [huaweis panguweather ai model can predict wea...
6 5 218 5_wildfires_smoke_fires_cameras [wildfires, smoke, fires, cameras, wildfire, f... [the threat of wildfires is rising so is new a...
7 6 216 6_ab_data_engineer_experience [ab, data, engineer, experience, management, s... [data scientist itonline home about us busine...
8 7 168 7_data science_data_science_digi [data science, data, science, digi, nv, data s... [switching to data science career guide on tra...
9 8 160 8_nsfw_ai chat_chat_girlfriend [nsfw, ai chat, chat, girlfriend, characters, ... [the best site for nsfw ai chat amp ai girlfri...
In [12]:
topic_info = topic_model.get_topic_info()
print(topic_info)
     Topic  Count                                     Name  \
0       -1  57406                    -1_ai_laptop_news_new   
1        0  90058                 0_ai_news_new_technology   
2        1    432               1_sports_nfl_wimbledon_ibm   
3        2    262        2_species_birds_conservation_bird   
4        3    231  3_beatles_mccartney_song_paul mccartney   
..     ...    ...                                      ...   
166    165     26      165_musicians_album_protest_artists   
167    166     26         166_rabbit_device_pocket_january   
168    167     26            167_cardiac_iv_ai human_valve   
169    168     26         168_fairy_circles_study_patterns   
170    169     26          169_buffett_berkshire_ago_omaha   

                                        Representation  \
0    [ai, laptop, news, new, data, technology, medi...   
1    [ai, news, new, technology, data, share, busin...   
2    [sports, nfl, wimbledon, ibm, players, fans, t...   
3    [species, birds, conservation, bird, animals, ...   
4    [beatles, mccartney, song, paul mccartney, pau...   
..                                                 ...   
166  [musicians, album, protest, artists, bush, cre...   
167  [rabbit, device, pocket, january, ces, ai pin,...   
168  [cardiac, iv, ai human, valve, monitoring, hea...   
169  [fairy, circles, study, patterns, computer vis...   
170  [buffett, berkshire, ago, omaha, warren, warre...   

                                   Representative_Docs  
0    [dataloop to supercharge adoption timeline for...  
1    [generative artificial intelligence research r...  
2    [ibm brings generative ai commentary and ai dr...  
3    [a new ai tool generates synthetic birdsongs t...  
4    [the beatles are releasing their last record a...  
..                                                 ...  
166  [musicians release silent album to protest uk ...  
167  [an indepth explanation of ces s ai sensation ...  
168  [biotronik announced first implant of new impl...  
169  [new discoveries of unexplained fairy circles ...  
170  [buffett shares good news on profits ai though...  

[171 rows x 5 columns]
In [13]:
# Count total topics (excluding -1)
num_topics = df_relevant_articles['topic'].nunique() - (1 if -1 in df_relevant_articles['topic'].values else 0)
print(f"Number of meaningful topics: {num_topics}")
Number of meaningful topics: 170
In [17]:
topic_model = topic_model.reduce_topics(docs, nr_topics=30)
2025-05-26 21:58:19,668 - BERTopic - Topic reduction - Reducing number of topics
2025-05-26 21:58:19,881 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-26 22:03:30,470 - BERTopic - Representation - Completed ✓
2025-05-26 22:03:30,505 - BERTopic - Topic reduction - Reduced number of topics from 171 to 30
In [20]:
df_relevant_articles['reduced_topic'] = topic_model.topics_
In [22]:
topic_info_reduced = topic_model.get_topic_info()
display(topic_info_reduced.head(10))
Topic Count Name Representation Representative_Docs
0 -1 57406 -1_ai_news_new_data [ai, news, new, data, technology, laptop, busi... [meet the gig workers making ai machines more ...
1 0 90992 0_ai_news_new_data [ai, news, new, data, technology, business, sh... [new generative ai study highlights adoption u...
2 1 1374 1_weather_news_ai_new [weather, news, ai, new, public, help, ago, cl... [the threat of wildfires is rising so are new ...
3 2 846 2_ago_news_said_hours ago [ago, news, said, hours ago, hours, public, pe... [how europe is leading the world in building g...
4 3 835 3_sports_nfl_players_game [sports, nfl, players, game, ai, gt, games, pl... [selflearning ai unveils nfl against the sprea...
5 4 802 4_dental_health_ai_clinical [dental, health, ai, clinical, patients, ivf, ... [platinum dental services partners with overje...
6 5 761 5_quantum_materials_nobel_ai [quantum, materials, nobel, ai, physics, compu... [nextgen superconducting diode enhancing ai pe...
7 6 544 6_ai_church_victor_news [ai, church, victor, news, human, religious, t... [toleranceca most australians are worried abo...
8 7 488 7_lg_tv_oled_holographic [lg, tv, oled, holographic, yes, slide, produc... [lg c k oled evo with thinq ai oledcpua lg ...
9 8 436 8_data_data science_science_learning [data, data science, science, learning, ab, da... [how to get started in data science essential ...
In [26]:
fig = topic_model.visualize_barchart(top_n_topics=30)  # You can change 30 to 15 or 20 for simpler view
fig.show()
In [28]:
fig2 = topic_model.visualize_topics()
fig2.show()
In [30]:
topic_info_reduced = topic_model.get_topic_info()
display(topic_info_reduced.head(30))
Topic Count Name Representation Representative_Docs
0 -1 57406 -1_ai_news_new_data [ai, news, new, data, technology, laptop, busi... [meet the gig workers making ai machines more ...
1 0 90992 0_ai_news_new_data [ai, news, new, data, technology, business, sh... [new generative ai study highlights adoption u...
2 1 1374 1_weather_news_ai_new [weather, news, ai, new, public, help, ago, cl... [the threat of wildfires is rising so are new ...
3 2 846 2_ago_news_said_hours ago [ago, news, said, hours ago, hours, public, pe... [how europe is leading the world in building g...
4 3 835 3_sports_nfl_players_game [sports, nfl, players, game, ai, gt, games, pl... [selflearning ai unveils nfl against the sprea...
5 4 802 4_dental_health_ai_clinical [dental, health, ai, clinical, patients, ivf, ... [platinum dental services partners with overje...
6 5 761 5_quantum_materials_nobel_ai [quantum, materials, nobel, ai, physics, compu... [nextgen superconducting diode enhancing ai pe...
7 6 544 6_ai_church_victor_news [ai, church, victor, news, human, religious, t... [toleranceca most australians are worried abo...
8 7 488 7_lg_tv_oled_holographic [lg, tv, oled, holographic, yes, slide, produc... [lg c k oled evo with thinq ai oledcpua lg ...
9 8 436 8_data_data science_science_learning [data, data science, science, learning, ab, da... [how to get started in data science essential ...
10 9 410 9_fashion_ai_news_art [fashion, ai, news, art, anime, new, images, a... [artificial intelligence step out in ai stile...
11 10 353 10_beatles_mccartney_paul mccartney_paul [beatles, mccartney, paul mccartney, paul, son... [the beatles are releasing their final record ...
12 11 329 11_said_county_ago_news [said, county, ago, news, child, hours ago, ho... [opaque ai tool may flag parents with disabili...
13 12 280 12_ago_news_film_voice [ago, news, film, voice, writers, hollywood, h... [could ai pen casablanca screenwriters take ai...
14 13 268 13_shares_etf_ratings_llc [shares, etf, ratings, llc, robotics, analysts... [global x robotics artificial intelligence th...
15 14 188 14_aa_update_suicide_doi [aa, update, suicide, doi, machine, machine le... [safetylit predicting lifetime suicide attempt...
16 15 186 15_openai_altman_news_ago [openai, altman, news, ago, sam, ceo, death, s... [worldcoin scans eyeballs and offers crypto wh...
17 16 170 16_vehicle_vehicles_trailer_chatgpt [vehicle, vehicles, trailer, chatgpt, cars, fo... [ford uses ai to make connecting a trailer as ...
18 17 165 17_antarctic_waves_iceberg_rogue [antarctic, waves, iceberg, rogue, mapping, oc... [a new era of iceberg mapping how artificial i...
19 18 155 18_league_star_win_premier league [league, star, win, premier league, england, e... [ja morants statement sounded like it was writ...
20 19 142 19_republic_kingdom_email_peoples [republic, kingdom, email, peoples, password, ... [hilands convoy passes triway ai richlandsou...
21 20 66 20_patent_inventor_court_roberts [patent, inventor, court, roberts, law, patent... [can an ai system be an inventor full court sa...
22 21 66 21_stocks_tech stocks_companies_fool [stocks, tech stocks, companies, fool, investi... [artificial intelligence ai stocks that could ...
23 22 54 22_dal_salvador_museum_king [dal, salvador, museum, king, radio, npr, inst... [an ai salvador dal will answer any question w...
24 23 36 23_shah_rafah_creator_npr [shah, rafah, creator, npr, instagram, image, ... [all eyes on rafah is the internets most viral...
25 24 35 24_modi_summit_france_macron [modi, summit, france, macron, pm modi, india,... [pm modi to cochair ai summit to open new cons...
26 25 35 25_michael_family_interview_magazine [michael, family, interview, magazine, german,... [michael schumachers family plan legal action ...
27 26 30 26_george_special_comedy_lawsuit [george, special, comedy, lawsuit, estate, def... [george carlin estate sues over fake ai comedy...
28 27 28 27_tupac_drake_dead_song [tupac, drake, dead, song, estate, songs, kend... [it was a classic rap beef then drake revived ...
29 28 26 28_fairy_circles_study_patterns [fairy, circles, study, patterns, computer vis... [new discoveries of unexplained fairy circles ...
In [33]:
reduced_topic_to_industry = {
    0:  "General AI",
    1:  "Energy & Environment",             # weather, public, climate
    2:  "Policy & Governance",             # said, hours ago, public
    3:  "Sports & Events",                 # sports, nfl, players, game
    4:  "Healthcare",                      # dental, health, clinical
    5:  "Science & Research",              # quantum, materials, nobel
    6:  "Ethics & Society",                # church, human, religious
    7:  "Consumer Tech",                   # LG, TV, OLED, holographic
    8:  "Data Science & Education",        # data science, learning
    9:  "Fashion & Art",                   # fashion, images, art
    10: "Media & Entertainment",           # beatles, paul, music
    11: "Public Health",                   # county, child, care
    12: "Film & Storytelling",             # hollywood, film, voice
    13: "Finance & Investment",            # ETF, ratings, shares
    14: "Mental Health & Safety",          # suicide, update, prediction
    15: "OpenAI & Leadership",             # altman, openai, sam
    16: "Auto & Mobility",                 # vehicle, trailer, cars
    17: "Climate & Oceanography",          # iceberg, waves, rogue
    18: "Sports & Culture",                # league, star, win
    19: "Politics & Governance",           # republic, kingdom, people
    20: "Legal & IP",                      # patent, inventor, court
    21: "Finance & Tech Stocks",           # stocks, fool, companies
    22: "Museums & Art",                   # museum, salvador dalí
    23: "Digital Creators & Culture",      # rafah, instagram, viral
    24: "International Politics",          # modi, summit, macron
    25: "Media & Privacy",                 # family, interview, magazine
    26: "Comedy & Legal Issues",           # comedy, lawsuit, george
    27: "Music & IP Rights",               # tupac, song, estate
    28: "Science & Discovery",             # fairy circles, patterns
}
In [35]:
# Apply mapping to assign industries
df_relevant_articles['industry'] = df_relevant_articles['reduced_topic'].map(reduced_topic_to_industry)
In [37]:
industry_counts = df_relevant_articles['industry'].value_counts().reset_index()
industry_counts.columns = ['Industry', 'Article Count']
display(industry_counts)
Industry Article Count
0 General AI 90992
1 Energy & Environment 1374
2 Policy & Governance 846
3 Sports & Events 835
4 Healthcare 802
5 Science & Research 761
6 Ethics & Society 544
7 Consumer Tech 488
8 Data Science & Education 436
9 Fashion & Art 410
10 Media & Entertainment 353
11 Public Health 329
12 Film & Storytelling 280
13 Finance & Investment 268
14 Mental Health & Safety 188
15 OpenAI & Leadership 186
16 Auto & Mobility 170
17 Climate & Oceanography 165
18 Sports & Culture 155
19 Politics & Governance 142
20 Legal & IP 66
21 Finance & Tech Stocks 66
22 Museums & Art 54
23 Digital Creators & Culture 36
24 International Politics 35
25 Media & Privacy 35
26 Comedy & Legal Issues 30
27 Music & IP Rights 28
28 Science & Discovery 26
In [39]:
sample_articles = df_relevant_articles.groupby('industry').apply(
    lambda x: x.sample(1, random_state=42)
)[['cleaned_text', 'industry', 'reduced_topic']].reset_index(drop=True)

display(sample_articles)
/var/folders/59/_xzwcqns6vj35vgt26hxm7qh0000gn/T/ipykernel_3581/2387598375.py:1: DeprecationWarning:

DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.

cleaned_text industry reduced_topic
0 gm explores using chatgpt in vehicles daily m... Auto & Mobility 16
1 mampiasa ai sy fianarana milina ny mpahay sian... Climate & Oceanography 17
2 george carlin estate sues over fake comedy spe... Comedy & Legal Issues 26
3 patsnap to launch first aipowered gpt tool to ... Consumer Tech 7
4 data science interview preparation course to... Data Science & Education 8
5 all eyes on rafah is the internets most viral ... Digital Creators & Culture 23
6 chinas use of ai to operate military satellite... Energy & Environment 1
7 when it comes to religion artificial intellige... Ethics & Society 6
8 hm to use ai to create model doppelgangers ama... Fashion & Art 9
9 voice analysis shows striking similarity betwe... Film & Storytelling 12
10 global x robotics artificial intelligence etf... Finance & Investment 13
11 could this undervalued ai company be canadas n... Finance & Tech Stocks 21
12 kinetica launches generative ai solution for r... General AI 0
13 overjet partners with dental care alliance pro... Healthcare 4
14 modi cochairs actionpacked ai summit in paris ... International Politics 24
15 an indepth analysis of inventorship of ai and ... Legal & IP 20
16 paul mccartney employs ai to help create the b... Media & Entertainment 10
17 german publisher apologises for fake schumache... Media & Privacy 25
18 safetylit road traffic injury prevention the r... Mental Health & Safety 14
19 an ai salvador dal will answer any question wh... Museums & Art 22
20 it was a classic rap beef then drake revived t... Music & IP Rights 27
21 sam altman sam altman wont return as ceo of op... OpenAI & Leadership 15
22 pakistan is stunned as party of imprisoned exp... Policy & Governance 2
23 as social media guardrails fade and ai deepfak... Politics & Governance 19
24 megan thee stallions ai sex tape reignites deb... Public Health 11
25 the mystery of fairy circles expands artificia... Science & Discovery 28
26 ai in chemicals market size projected to be wo... Science & Research 5
27 man united tears slot glory everton progress ... Sports & Culture 18
28 chatgpt ranks the secs best football uniforms ... Sports & Events 3

Custom Classification for General AI¶

In [42]:
industry_keywords = {
    "Healthcare": ["healthcare", "clinical", "hospital", "patient", "biotech", "medical", "treatment", "disease"],
    "Finance": ["bank", "fraud", "fintech", "credit", "investment", "insurance", "loan", "trading", "fund"],
    "Retail & Consumer Goods": ["retail", "customer", "shopping", "store", "ecommerce", "product", "purchase", "brand"],
    "Education": ["education", "student", "university", "classroom", "school", "curriculum", "learning"],
    "Legal & IP": ["legal", "court", "law", "intellectual", "policy", "governance", "regulation", "rights"],
    "Technology & Software": ["software", "platform", "tools", "apps", "development", "automation", "api"],
    "Media & Entertainment": ["music", "film", "movie", "tv", "video", "beatles", "entertainment", "netflix"],
    "Climate & Environment": ["climate", "environment", "carbon", "sustainability", "emissions", "renewable", "energy"],
    "Social Media": ["instagram", "twitter", "facebook", "viral", "followers", "content", "platform"],
    "Government & Politics": ["government", "policy", "election", "minister", "modi", "summit", "president"],
}
In [44]:
def reclassify_general_ai(text):
    text = text.lower()
    for industry, keywords in industry_keywords.items():
        if any(keyword in text for keyword in keywords):
            return industry
    return "General AI"
In [46]:
# Only reclassify articles in topic 0 that are currently labeled General AI
mask = (df_relevant_articles['reduced_topic'] == 0) & (df_relevant_articles['industry'] == "General AI")

df_relevant_articles.loc[mask, 'industry'] = df_relevant_articles.loc[mask, 'cleaned_text'].apply(reclassify_general_ai)
In [48]:
industry_counts_updated = df_relevant_articles['industry'].value_counts().reset_index()
industry_counts_updated.columns = ['Industry', 'Article Count']
display(industry_counts_updated)
Industry Article Count
0 Healthcare 38598
1 Finance 32653
2 Retail & Consumer Goods 14929
3 Education 3411
4 Legal & IP 1794
5 Energy & Environment 1374
6 Policy & Governance 846
7 Sports & Events 835
8 Science & Research 761
9 Ethics & Society 544
10 Consumer Tech 488
11 Media & Entertainment 462
12 Data Science & Education 436
13 Fashion & Art 410
14 Public Health 329
15 Technology & Software 294
16 Film & Storytelling 280
17 Finance & Investment 268
18 Mental Health & Safety 188
19 OpenAI & Leadership 186
20 Auto & Mobility 170
21 Climate & Oceanography 165
22 Sports & Culture 155
23 Politics & Governance 142
24 Finance & Tech Stocks 66
25 Museums & Art 54
26 Digital Creators & Culture 36
27 International Politics 35
28 Media & Privacy 35
29 Comedy & Legal Issues 30
30 General AI 28
31 Music & IP Rights 28
32 Science & Discovery 26
33 Social Media 18
34 Government & Politics 14
35 Climate & Environment 12
In [62]:
final_industry_mapping = {
    'Finance': 'Finance',
    'Healthcare': 'Healthcare',
    'Retail & Consumer Goods': 'Retail & Consumer Goods',
    'Retail & Food': 'Retail & Consumer Goods',
    'Education': 'Education',
    'Education & History': 'Education',
    'Legal': 'Legal',
    'Media & Entertainment': 'Media & Entertainment',
    'Marketing & Advertising': 'Marketing & Advertising',
    'Media & Tech News': 'Media & Entertainment',
    'Media & Policy': 'Media & Entertainment',
    'Entertainment & Performing Arts': 'Media & Entertainment',
    'Technology & Software': 'Technology & Software',
    'Software Development': 'Technology & Software',
    'AI Safety': 'Technology & Software',
    'Energy & Environment': 'Energy & Environment',
    'Climate Science': 'Energy & Environment',
    'Space & Astronomy': 'Energy & Environment',
    'Oceanography & Science': 'Energy & Environment',
    'Social Media': 'Social Media',
    'Social Media Tools': 'Social Media',
    'Publishing & Journalism': 'Culture & Journalism',
    'Museums & Culture': 'Culture & Journalism',
    'Digital Legacy & Ethics': 'Ethics & Society',
    'Surveillance & Location': 'Security & Surveillance',
    'General AI': 'General AI',
    'Elder Care': 'Healthcare',
    'Sports & Events': 'Media & Entertainment',
    'Science & Research': 'Science & Research',  # or its own if needed
    'Politics & Governance': 'Government & Politics',
    'International Politics': 'Government & Politics',
    'Legal & IP': 'Legal',
    'Climate & Oceanography': 'Energy & Environment',
    'Media & Privacy': 'Media & Entertainment',
    'Digital Creators & Culture': 'Media & Entertainment',
    'Fashion & Art': 'Media & Entertainment',
    'OpenAI & Leadership': 'Technology & Software',
    'Music & IP Rights': 'Media & Entertainment',
    'Comedy & Legal Issues': 'Media & Entertainment',
    'Mental Health & Safety': 'Healthcare',
    'Public Health': 'Healthcare',
    'Film & Storytelling': 'Media & Entertainment',
    'Auto & Mobility': 'Technology & Software',
    'Ethics & Society': 'Ethics & Society',
    'Government & Politics': 'Government & Politics',
    'Science & Discovery': 'Energy & Environment',
    'Consumer Tech': 'Technology & Software',
    'Data Science & Education': 'Education'
}
In [64]:
df_relevant_articles['final_industry'] = df_relevant_articles['industry'].map(final_industry_mapping)
In [66]:
final_industry_counts = df_relevant_articles['final_industry'].value_counts().reset_index()
final_industry_counts.columns = ['Final Industry', 'Total Articles']
display(final_industry_counts)
Final Industry Total Articles
0 Healthcare 39115
1 Finance 32653
2 Retail & Consumer Goods 14929
3 Education 3847
4 Media & Entertainment 2116
5 Legal 1794
6 Energy & Environment 1565
7 Technology & Software 1138
8 Science & Research 761
9 Ethics & Society 544
10 Government & Politics 191
11 General AI 28
12 Social Media 18

Plots¶

In [71]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Make sure date is datetime type
df_relevant_articles['date'] = pd.to_datetime(df_relevant_articles['date'], errors='coerce')

# Drop rows without valid dates
df_time = df_relevant_articles.dropna(subset=['date'])

# Optional: Choose top N industries for clarity
top_industries = df_time['final_industry'].value_counts().head(6).index.tolist()
df_time = df_time[df_time['final_industry'].isin(top_industries)]

# Group by month and industry
df_time['month'] = df_time['date'].dt.to_period('M').dt.to_timestamp()
df_grouped = df_time.groupby(['month', 'final_industry']).size().reset_index(name='Article Count')

# Plot
plt.figure(figsize=(14, 7))
sns.lineplot(data=df_grouped, x='month', y='Article Count', hue='final_industry', marker='o')
plt.title("AI Article Volume Over Time by Industry", fontsize=16)
plt.xlabel("Month")
plt.ylabel("Number of Articles")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
To disable this warning, you can either:
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
No description has been provided for this image
In [83]:
df_plot = final_industry_counts.sort_values(by='Total Articles', ascending=True)

# Bar chart
plt.figure(figsize=(12, 8))
sns.barplot(data=df_plot, x='Total Articles', y='Final Industry', palette='RdPu')
plt.title("AI Articles Across Industries", fontsize=16)
plt.xlabel("Number of Articles")
plt.ylabel("Industry")
plt.grid(axis='x', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [85]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Sort values
df_plot_sorted = final_industry_counts.sort_values(by='Total Articles', ascending=False).reset_index(drop=True)

# Calculate total and percentage
total_articles = df_plot_sorted['Total Articles'].sum()
df_plot_sorted['percentage'] = df_plot_sorted['Total Articles'] / total_articles

# Threshold to group small slices
threshold = 0.01  # 1%
major_industries = df_plot_sorted[df_plot_sorted['percentage'] >= threshold]
others = df_plot_sorted[df_plot_sorted['percentage'] < threshold]
other_sum = others['Total Articles'].sum()

# Combine "Other" if needed
final_pie_data = major_industries[['Final Industry', 'Total Articles']].copy()
if other_sum > 0:
    other_row = pd.DataFrame([{'Final Industry': 'Other', 'Total Articles': other_sum}])
    final_pie_data = pd.concat([final_pie_data, other_row], ignore_index=True)

# Plot
plt.figure(figsize=(10, 10))
colors = sns.color_palette('RdPu', len(final_pie_data))

plt.pie(
    final_pie_data['Total Articles'],
    labels=final_pie_data['Final Industry'],
    autopct='%1.1f%%',
    startangle=140,
    colors=colors,
    wedgeprops={'edgecolor': 'white'},
    textprops={'fontsize': 11}
)

plt.title("AI Article Distribution by Industry", fontsize=16)
plt.tight_layout()
plt.show()
No description has been provided for this image

Temporal Trends Analysis¶

In [90]:
# Ensure 'date' column is datetime
df_relevant_articles['date'] = pd.to_datetime(df_relevant_articles['date'], errors='coerce')

# Drop rows with missing dates
df_time = df_relevant_articles.dropna(subset=['date']).copy()

# Add 'month' column (truncate to month)
df_time['month'] = df_time['date'].dt.to_period('M').dt.to_timestamp()
In [96]:
import seaborn as sns
import matplotlib.pyplot as plt

# Group by month and industry (no filtering now)
monthly_counts_all = df_time.groupby(['month', 'final_industry']).size().reset_index(name='Article Count')

# Plot all industries
plt.figure(figsize=(16, 8))
sns.lineplot(data=monthly_counts_all, x='month', y='Article Count', hue='final_industry', marker='o')
plt.title('Monthly AI Article Volume Across All Industries', fontsize=16)
plt.xlabel('Month')
plt.ylabel('Number of Articles')
plt.xticks(rotation=45)
plt.legend(title='Industry', bbox_to_anchor=(1.02, 1), loc='upper left')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [94]:
# Pivot data for stacked area chart
pivot_area = df_top.groupby(['month', 'final_industry']).size().unstack(fill_value=0)

# Plot
pivot_area.plot(kind='area', figsize=(14, 6), cmap='Set2', stacked=True)
plt.title('Share of AI Article Coverage Over Time by Industry')
plt.xlabel('Month')
plt.ylabel('Number of Articles')
plt.xticks(rotation=45)
plt.legend(title='Industry', bbox_to_anchor=(1.01, 1), loc='upper left')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [105]:
import seaborn as sns
import matplotlib.pyplot as plt

# Convert to proper string format for display
heatmap_data.columns = heatmap_data.columns.strftime('%Y-%m')  # Clean month labels

# Plot
plt.figure(figsize=(16, 8))
sns.heatmap(
    heatmap_data,
    cmap="YlGnBu",
    linewidths=0.3,
    linecolor='white',
    cbar_kws={'label': 'Number of Articles'}
)

plt.title("📅 AI Article Volume Heatmap (Industry vs. Month)", fontsize=16)
plt.xlabel("Month")
plt.ylabel("Industry")
plt.xticks(rotation=45, ha='right')  # Tilt for readability
plt.tight_layout()
plt.show()
No description has been provided for this image
In [111]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Combine all cleaned_texts by industry
industry_texts = df_relevant_articles.groupby('final_industry')['cleaned_text'].apply(lambda x: ' '.join(x)).reset_index()

# Plot word clouds for top 6 industries
top_inds = df_relevant_articles['final_industry'].value_counts().head(6).index.tolist()
industry_texts = industry_texts[industry_texts['final_industry'].isin(top_inds)]

# Generate and plot word clouds
for _, row in industry_texts.iterrows():
    wc = WordCloud(width=800, height=400, background_color='white').generate(row['cleaned_text'])
    plt.figure(figsize=(10, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"Word Cloud: {row['final_industry']}", fontsize=16)
    plt.tight_layout()
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [112]:
# This shows similarity between topics (c-TF-IDF vectors)
fig = topic_model.visualize_heatmap(top_n_topics=30)
fig.show()
In [115]:
# Shows semantic relationships between topics in 2D space
fig = topic_model.visualize_topics()
fig.show()
In [135]:
# Save classified articles to CSV
df_relevant_articles.to_csv("ai_articles_by_industry.csv", index=False)